Loading Library

library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:formattable':
## 
##     style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
library(htmltools)
library(htmlwidgets)
bin148 <- read_csv("bin148_topfiftyhits.txt", 
                   col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 7768 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bin148_eval <- bin148 %>% 
  filter(as.numeric(evalue) <= 1e-10)
bin148_kingdomcount <- bin148_eval %>% 
  count(sskingdoms)
bin148_kingdomcount
## # A tibble: 5 × 2
##   sskingdoms     n
##   <chr>      <int>
## 1 Archaea      117
## 2 Bacteria    1739
## 3 Eukaryota   1050
## 4 N/A            3
## 5 Viruses     1566
piechart_bin148 <- ggplot(bin148_kingdomcount, aes(x="", y=n, fill=sskingdoms)) +
  geom_bar(stat="identity", width=1) +
  coord_polar("y", start=0) +
  scale_fill_brewer(palette = "RdPu") +
   theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank())
piechart_bin148

bin148_rmduplicates <- bin148_eval %>% 
  group_by(qseqid) %>% 
  distinct(sseqid, .keep_all = TRUE)
bin148_eukaryotes <- bin148_rmduplicates %>%
  filter(sskingdoms == "Eukaryota") %>%
  group_by(qseqid, sseqid, staxids) %>%
  slice_min(order_by = evalue) %>%
  ungroup() %>% 
  mutate(genus = word(sscinames, 1))
bin148_eukaryotes
## # A tibble: 714 × 8
##    qseqid             sseqid         evalue staxids sscin…¹ sskin…² stitle genus
##    <chr>              <chr>           <dbl> <chr>   <chr>   <chr>   <chr>  <chr>
##  1 Contig103910588_1  dbj|GFR915… 7.64e- 73 1093978 Elysia… Eukary… CAP-s… Elys…
##  2 Contig103910588_1  emb|CAF097… 1.77e- 39 104777  Brachi… Eukary… unnam… Brac…
##  3 Contig103910588_1  gb|KAG2378… 3.7 e- 40 51637   Naegle… Eukary… hypot… Naeg…
##  4 Contig103910588_1  gb|KAH3765… 1.45e- 42 1580589 Pelomy… Eukary… poly … Pelo…
##  5 Contig103910588_1  gb|KJE9221… 9.59e- 40 595528  Capsas… Eukary… hypot… Caps…
##  6 Contig103910588_1  ref|XP_004… 3.06e- 40 595528  Capsas… Eukary… hypot… Caps…
##  7 Contig103910588_1  ref|XP_013… 5.36e- 39 461836  Thecam… Eukary… hypot… Thec…
##  8 Contig103910588_1  ref|XP_044… 1.41e- 37 51637   Naegle… Eukary… uncha… Naeg…
##  9 Contig103910588_10 dbj|GFR915… 5.76e-148 1093978 Elysia… Eukary… ribon… Elys…
## 10 Contig103910588_10 emb|CAB337… 5.84e-103 197152  Cloeon… Eukary… Hypot… Cloe…
## # … with 704 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin148_egenus_counts <- bin148_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(bin148_egenus_counts)
## # A tibble: 102 × 3
## # Groups:   qseqid, genus [102]
##    qseqid             genus             count
##    <chr>              <chr>             <int>
##  1 Contig103910588_1  Capsaspora            2
##  2 Contig103910588_1  Naegleria             2
##  3 Contig103910588_10 Danio                 2
##  4 Contig103910588_10 Drosophila           18
##  5 Contig103910588_10 Penaeus               4
##  6 Contig103910588_10 Takifugu              2
##  7 Contig103910588_4  Rhizophagus           6
##  8 Contig103910588_5  Albugo                2
##  9 Contig103910588_5  Aphanomyces          18
## 10 Contig103910588_5  Peronosclerospora     2
## # … with 92 more rows
# Count the number of times each genus appears for a specific protein sequence
bin148_egenus_counts_3 <- bin148_eukaryotes %>%
  group_by(qseqid, genus) %>%
  summarise(count = n(), .groups = "keep") %>% 
  filter(count > 1) # Only include those with more than one count 
print(bin148_egenus_counts_3)
## # A tibble: 102 × 3
## # Groups:   qseqid, genus [102]
##    qseqid             genus             count
##    <chr>              <chr>             <int>
##  1 Contig103910588_1  Capsaspora            2
##  2 Contig103910588_1  Naegleria             2
##  3 Contig103910588_10 Danio                 2
##  4 Contig103910588_10 Drosophila           18
##  5 Contig103910588_10 Penaeus               4
##  6 Contig103910588_10 Takifugu              2
##  7 Contig103910588_4  Rhizophagus           6
##  8 Contig103910588_5  Albugo                2
##  9 Contig103910588_5  Aphanomyces          18
## 10 Contig103910588_5  Peronosclerospora     2
## # … with 92 more rows
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue",  "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin", "skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue",  "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
bin148_eukaryotes_ggplot_genusFreq <- ggplot(bin148_egenus_counts_3, aes(x = genus, y = count)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  theme(text = element_text(size = 20), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin148_eukaryotes_ggplot_genusFreq

bin148_eukaryotes_ggplot <- ggplot(bin148_egenus_counts_3, aes(x = qseqid, y = count, fill = genus)) +
  geom_bar(stat = "identity", width = 0.7, position = "stack") +
  scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
  theme_minimal() +
  scale_fill_manual(values = distinct_colors) +
  theme(text = element_text(size = 25), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin148_eukaryotes_ggplot

#### Using ggsave to save .png formats of ggplots. ggsave(“bin148_genusFreq_qseqid.png”, plot = bin148_eukaryotes_ggplot, width = 20, height = 10)

ggplotly(bin148_eukaryotes_ggplot)

Viruses

library(DT)
bin148_viruses <- bin148_rmduplicates %>% 
  filter(sskingdoms == "Viruses")
bin148_viruses
## # A tibble: 1,466 × 7
## # Groups:   qseqid [60]
##    qseqid          sseqid                evalue staxids   sscin…¹ sskin…² stitle
##    <chr>           <chr>                  <dbl> <chr>     <chr>   <chr>   <chr> 
##  1 Contig2728381_4 gb|WEG42285.1|      5.98e-53 10497     Africa… Viruses BA71V…
##  2 Contig2728381_4 sp|P0CAC2.1|        7.46e-53 10500     Africa… Viruses RecNa…
##  3 Contig2728381_4 gb|QRY19113.1|      9.61e-53 10497     Africa… Viruses BA71V…
##  4 Contig2728381_4 ref|YP_009702820.1| 1.24e-52 10497;56… Africa… Viruses BA71V…
##  5 Contig2728381_4 ref|YP_009702985.1| 1.7 e-52 10497;29… Africa… Viruses BA71V…
##  6 Contig2728381_4 ref|YP_009927211.1| 1.81e-52 10497     Africa… Viruses hypot…
##  7 Contig2728381_4 gb|QID21254.1|      2.67e-52 10497     Africa… Viruses pB263…
##  8 Contig2728381_4 gb|UNZ12432.1|      2.73e-52 10497     Africa… Viruses B263R…
##  9 Contig2728381_4 gb|UPH95742.1|      2.76e-52 10497     Africa… Viruses B263R…
## 10 Contig2728381_4 gb|QGM12922.2|      3.74e-52 10497     Africa… Viruses pB263…
## # … with 1,456 more rows, and abbreviated variable names ¹​sscinames,
## #   ²​sskingdoms
datatable(bin148_viruses)